class: center, middle, inverse, title-slide .title[ # Text analysis II: Building a tidytext toolbox ] .subtitle[ ## Introduction to R for Social Sciences (Sociology ∞ HERO) ] .author[ ### Josef Ginnerskov, doctoral candidate ] .institute[ ### Department of Sociology ] .date[ ### 2022-06-03 ] --- # Loading books from www.gutenberg.org ```r library(gutenbergr) sociology_raw <- gutenberg_download(c(41360, 46423, 13205, 30610, 6568, 21609), meta_fields = c("author", "title"), strip = TRUE) head(sociology_raw) ``` ``` ## # A tibble: 6 x 4 ## gutenberg_id text author title ## <int> <chr> <chr> <chr> ## 1 6568 "SOCIOLOGY AND MODERN SOCIAL PROBLEMS" Ellwood, Charles A.~ Soci~ ## 2 6568 "" Ellwood, Charles A.~ Soci~ ## 3 6568 "BY" Ellwood, Charles A.~ Soci~ ## 4 6568 "" Ellwood, Charles A.~ Soci~ ## 5 6568 "CHARLES A. ELLWOOD, PH. D." Ellwood, Charles A.~ Soci~ ## 6 6568 "" Ellwood, Charles A.~ Soci~ ``` --- # Preprocessing - fix awkward signs ```r sociology_raw$text <- gsub("_","",as.character(sociology_raw$text)) sociology_raw$text <- gsub("--"," ",as.character(sociology_raw$text)) ``` --- # Turning books into a tidy tibble ```r library(tidyverse) library(tidytext) tidy_sociology <- sociology_raw %>% mutate(line = row_number()) %>% unnest_tokens(word, text) head(tidy_sociology) ``` ``` ## # A tibble: 6 x 5 ## gutenberg_id author title line word ## <int> <chr> <chr> <int> <chr> ## 1 6568 Ellwood, Charles A. (Charles Abram) Sociology and Mo~ 1 soci~ ## 2 6568 Ellwood, Charles A. (Charles Abram) Sociology and Mo~ 1 and ## 3 6568 Ellwood, Charles A. (Charles Abram) Sociology and Mo~ 1 mode~ ## 4 6568 Ellwood, Charles A. (Charles Abram) Sociology and Mo~ 1 soci~ ## 5 6568 Ellwood, Charles A. (Charles Abram) Sociology and Mo~ 1 prob~ ## 6 6568 Ellwood, Charles A. (Charles Abram) Sociology and Mo~ 3 by ``` --- # Removing stop words ```r tidy_sociology <- tidy_sociology %>% anti_join(stop_words) my_words <- tibble(word = c("p", "cit", "tr", "pp", "ff", "nat", "ibid", "geddes", "prof", "per")) tidy_sociology <- tidy_sociology %>% anti_join(my_words) ``` --- # Basic global/corpus count ```r tidy_sociology %>% count(word, sort = TRUE) %>% top_n(10) ``` ``` ## # A tibble: 10 x 2 ## word n ## <chr> <int> ## 1 social 2471 ## 2 life 1981 ## 3 time 1166 ## 4 society 1164 ## 5 people 1075 ## 6 family 992 ## 7 religious 970 ## 8 individual 942 ## 9 form 915 ## 10 money 912 ``` --- # Basic local/document count ```r tidy_sociology %>% filter(author == "Marx, Karl") %>% count(word, sort = TRUE) %>% top_n(10) ``` ``` ## # A tibble: 10 x 2 ## word n ## <chr> <int> ## 1 money 834 ## 2 gold 649 ## 3 commodities 621 ## 4 circulation 562 ## 5 exchange 545 ## 6 labor 523 ## 7 commodity 434 ## 8 production 414 ## 9 form 335 ## 10 time 328 ``` --- # Generating a word cloud (a) ```r library(wordcloud) tidy_sociology %>% count(word) %>% with(wordcloud(word, n, max.words = 100, colors = brewer.pal(8, "Dark2"))) ``` --- # Generating a word cloud (b) <img src="data:image/png;base64,#R-course-ginnerskov-part3-2022_files/figure-html/unnamed-chunk-14-1.png" width="80%" style="display: block; margin: auto;" /> --- # Comparing word occurences per book ```r words_sociology <- tidy_sociology %>% count(author, word, sort = TRUE) words_sociology_work <- words_sociology %>% group_by(author) %>% summarize(total = sum(n)) words_sociology_work <- left_join(words_sociology_work, words_sociology) ``` ``` ## # A tibble: 46,818 x 4 ## author word n total ## <chr> <chr> <int> <int> ## 1 "Marx, Karl" money 834 34820 ## 2 "Rowe, Henry K. (Henry Kalloch)" social 829 52553 ## 3 "Blackmar, Frank W. (Frank Wilson)" life 737 71571 ## 4 "Ellwood, Charles A. (Charles Abram)" social 649 33522 ## 5 "Marx, Karl" gold 649 34820 ## 6 "Marx, Karl" commodities 621 34820 ## 7 "Durkheim, \u00c9mile" religious 598 77503 ## 8 "Marx, Karl" circulation 562 34820 ## 9 "Marx, Karl" exchange 545 34820 ## 10 "Blackmar, Frank W. (Frank Wilson)" people 536 71571 ## # ... with 46,808 more rows ``` --- # Viz word occurences per book (a) ```r words_sociology_work %>% filter (n >= 333) %>% mutate(word = reorder(word, n)) %>% ggplot(aes(x = word, y = n, fill = author)) + geom_col() + coord_flip() ``` --- # Viz word occurences per book (b) <img src="data:image/png;base64,#R-course-ginnerskov-part3-2022_files/figure-html/unnamed-chunk-18-1.png" width="80%" style="display: block; margin: auto;" /> --- # Comparing books via td-idf ```r sociology_tf_idf <- tidy_sociology %>% count(author, word, sort = TRUE) %>% bind_tf_idf(word, author, n) %>% arrange(-tf_idf) %>% group_by(author) %>% top_n(15) %>% ungroup ``` ``` ## # A tibble: 93 x 6 ## author word n tf idf tf_idf ## <chr> <chr> <int> <dbl> <dbl> <dbl> ## 1 "Marx, Karl" commodity 434 0.0125 1.79 0.0223 ## 2 "Marx, Karl" gold 649 0.0186 0.693 0.0129 ## 3 "Marx, Karl" commodities 621 0.0178 0.693 0.0124 ## 4 "Marx, Karl" silver 257 0.00738 1.10 0.00811 ## 5 "Durkheim, \u00c9mile" totem 496 0.00640 1.10 0.00703 ## 6 "Marx, Karl" labor 523 0.0150 0.405 0.00609 ## 7 "Durkheim, \u00c9mile" totemic 347 0.00448 1.10 0.00492 ## 8 "Geddes, Patrick, Sir" civics 68 0.00434 1.10 0.00477 ## 9 "Durkheim, \u00c9mile" gillen 205 0.00265 1.79 0.00474 ## 10 "Durkheim, \u00c9mile" strehlow 203 0.00262 1.79 0.00469 ## # ... with 83 more rows ``` --- # Viz books via td-idf (a) ```r sociology_tf_idf %>% mutate(word = reorder_within(word, tf_idf, author)) %>% ggplot(aes(word, tf_idf, fill = author)) + geom_col(alpha = 0.8, show.legend = FALSE) + facet_wrap(~ author, scales = "free", ncol = 3) + scale_x_reordered() + coord_flip() + theme(strip.text=element_text(size=11)) ``` --- # Viz books via td-idf (b) <img src="data:image/png;base64,#R-course-ginnerskov-part3-2022_files/figure-html/unnamed-chunk-22-1.png" width="80%" style="display: block; margin: auto;" /> --- # Calculating word correlations ```r library(widyr) cors_sociology <- words_sociology_work %>% group_by(word) %>% filter(n >= 250) %>% pairwise_cor(word, author, sort = TRUE, upper = FALSE) ``` ``` ## # A tibble: 861 x 3 ## item1 item2 correlation ## <chr> <chr> <dbl> ## 1 social life 1 ## 2 money gold 1 ## 3 money commodities 1 ## 4 gold commodities 1 ## 5 money circulation 1 ## 6 gold circulation 1 ## 7 commodities circulation 1 ## 8 money exchange 1 ## 9 gold exchange 1 ## 10 commodities exchange 1 ## # ... with 851 more rows ``` --- # Word correlations -> word graph ```r library(igraph) set.seed(1234) g_keywords <- cors_sociology %>% filter(correlation > .3) %>% graph_from_data_frame() ``` ``` ## IGRAPH 2096f20 DN-- 42 249 -- ## + attr: name (v/c), correlation (e/n) ## + edges from 2096f20 (vertex names): ## [1] social ->life money ->gold ## [3] money ->commodities gold ->commodities ## [5] money ->circulation gold ->circulation ## [7] commodities ->circulation money ->exchange ## [9] gold ->exchange commodities ->exchange ## [11] circulation ->exchange money ->labor ## [13] gold ->labor commodities ->labor ## [15] circulation ->labor exchange ->labor ## + ... omitted several edges ``` --- # Viz the word graph (a) ```r library(ggraph) g_keywords %>% ggraph(layout = "fr") + geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), edge_colour = "orange") + geom_node_point(size = 0.5*igraph::degree(g_keywords), colour = "lightblue") + geom_node_text(aes(label = name), repel = TRUE) + theme_void() ``` --- # Viz the word graph (b) <img src="data:image/png;base64,#R-course-ginnerskov-part3-2022_files/figure-html/unnamed-chunk-28-1.png" width="80%" style="display: block; margin: auto;" /> --- # Preparing for topic modeling ```r sociology_dtm <- words_sociology_work %>% cast_dtm(author, word, n) ``` ``` ## <<DocumentTermMatrix (documents: 6, terms: 23308)>> ## Non-/sparse entries: 46818/93030 ## Sparsity : 67% ## Maximal term length: 23 ## Weighting : term frequency (tf) ``` --- # Run a topic model ```r library(topicmodels) sociology_lda <- LDA(sociology_dtm, k = 4, control = list(seed = 1234)) ``` ``` ## A LDA_VEM topic model with 4 topics. ``` --- # Inspecting the beta (word/topic prob) ```r sociology_topics <- tidy(sociology_lda, matrix = "beta") ``` ``` ## # A tibble: 93,232 x 3 ## topic term beta ## <int> <chr> <dbl> ## 1 1 money 5.38e- 4 ## 2 2 money 2.39e- 2 ## 3 3 money 2.27e- 4 ## 4 4 money 5.77e-56 ## 5 1 social 8.80e- 3 ## 6 2 social 4.24e- 3 ## 7 3 social 1.78e- 2 ## 8 4 social 3.32e- 3 ## 9 1 life 9.22e- 3 ## 10 2 life 6.31e- 4 ## # ... with 93,222 more rows ``` --- # Top terms in topic model beta ```r top_terms <- sociology_topics %>% group_by(topic) %>% slice_max(beta, n = 10) %>% ungroup() %>% arrange(topic, -beta) ``` ``` ## # A tibble: 41 x 3 ## topic term beta ## <int> <chr> <dbl> ## 1 1 life 0.00922 ## 2 1 social 0.00880 ## 3 1 people 0.00743 ## 4 1 government 0.00660 ## 5 1 progress 0.00465 ## 6 1 civilization 0.00456 ## 7 1 development 0.00422 ## 8 1 church 0.00375 ## 9 1 time 0.00370 ## 10 1 power 0.00362 ## # ... with 31 more rows ``` --- # Viz topic model beta (a) ```r library(ggplot2) top_terms %>% mutate(term = reorder_within(term, beta, topic)) %>% ggplot(aes(beta, term, fill = factor(topic))) + geom_col(show.legend = FALSE) + facet_wrap(~ topic, scales = "free") + scale_y_reordered() ``` --- # Viz topic model beta (b) <img src="data:image/png;base64,#R-course-ginnerskov-part3-2022_files/figure-html/unnamed-chunk-38-1.png" width="80%" style="display: block; margin: auto;" /> --- # Inspecting the gamma (doc/topic prob) ```r sociology_gamma <- tidy(sociology_lda, matrix = "gamma") ``` ``` ## # A tibble: 24 x 3 ## document topic gamma ## <chr> <int> <dbl> ## 1 "Marx, Karl" 1 0.000000760 ## 2 "Rowe, Henry K. (Henry Kalloch)" 1 0.785 ## 3 "Blackmar, Frank W. (Frank Wilson)" 1 1.00 ## 4 "Ellwood, Charles A. (Charles Abram)" 1 0.000000789 ## 5 "Durkheim, \u00c9mile" 1 0.000000341 ## 6 "Geddes, Patrick, Sir" 1 0.00000169 ## 7 "Marx, Karl" 2 1.00 ## 8 "Rowe, Henry K. (Henry Kalloch)" 2 0.000000503 ## 9 "Blackmar, Frank W. (Frank Wilson)" 2 0.000000370 ## 10 "Ellwood, Charles A. (Charles Abram)" 2 0.000000789 ## # ... with 14 more rows ``` --- # Viz topic model gamma (a) ```r sociology_gamma %>% mutate(document = reorder(document, gamma * topic)) %>% ggplot(aes(factor(topic), gamma)) + geom_boxplot() + facet_wrap(~ document) + labs(x = "topic", y = expression(gamma)) ``` --- # Viz topic model gamma (b) <img src="data:image/png;base64,#R-course-ginnerskov-part3-2022_files/figure-html/unnamed-chunk-42-1.png" width="80%" style="display: block; margin: auto;" /> --- # Adding sentiments to your tibble ```r bing <- get_sentiments("bing") tidy_sociology %>% inner_join(bing) %>% count(word, sentiment, sort = TRUE) %>% top_n(10) ``` ``` ## # A tibble: 10 x 3 ## word sentiment n ## <chr> <chr> <int> ## 1 gold positive 664 ## 2 progress positive 630 ## 3 modern positive 502 ## 4 primitive negative 414 ## 5 object negative 305 ## 6 crime negative 262 ## 7 death negative 222 ## 8 poverty negative 185 ## 9 struggle negative 180 ## 10 free positive 178 ``` --- # Viz top sentiments (a) ```r sentiment_counts <- tidy_sociology %>% inner_join(bing) %>% count(word, sentiment, sort = TRUE) sentiment_counts %>% filter(n > 100) %>% mutate(n = ifelse(sentiment == "negative", -n, n)) %>% mutate(word = reorder(word, n)) %>% ggplot(aes(word, n, fill = sentiment)) + geom_col() + coord_flip() + labs(y = "Contribution to sentiment") ``` --- # Viz top sentiments (b) <img src="data:image/png;base64,#R-course-ginnerskov-part3-2022_files/figure-html/unnamed-chunk-46-1.png" width="80%" style="display: block; margin: auto;" /> --- # The sentiment narrative unfolds (a) ```r library(tidyr) sentisociology <- tidy_sociology %>% inner_join(bing) %>% count(author, index = line %/% 100, sentiment) %>% spread(sentiment, n, fill = 0) %>% mutate(sentiment = positive - negative) ``` ```r library(ggplot2) ggplot(sentisociology, aes(index, sentiment, fill = author)) + geom_bar(stat = "identity", show.legend = FALSE) + facet_wrap(~author, ncol = 2, scales = "free_x") ``` --- # The sentiment narrative unfolds (b) <img src="data:image/png;base64,#R-course-ginnerskov-part3-2022_files/figure-html/unnamed-chunk-50-1.png" width="80%" style="display: block; margin: auto;" /> --- # Thank you for your time! ## Do not hesitate to contact me | | | |:---------------------------------------------------------------------------------------------|:-------------------------| | <a href="mailto:josef.ginnerskov@soc.uu.se">.UUred[<i class="fa fa-paper-plane fa-fw"></i>] |josef.ginnerskov@soc.uu.se | | <a href="http://twitter.com/doeparen">.UUred[<i class="fa fa-twitter fa-fw"></i>] |@doeparen | | <a href="http://github.com/doeparen">.UUred[<i class="fa fa-gitlab fa-fw"></i>] |@doeparen |